# Notebook-style script: forecasting daily BTC-USD prices with Greykite
# (Silverkite template), comparing three model configurations.
import datetime
import time
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import plotly.io as pio
# Render plotly figures inline in a notebook environment.
pio.renderers.default='notebook'
from greykite.algo.changepoint.adalasso.changepoint_detector import ChangepointDetector
from greykite.algo.forecast.silverkite.constants.silverkite_holiday import SilverkiteHoliday
from greykite.algo.forecast.silverkite.constants.silverkite_seasonality import SilverkiteSeasonalityEnum
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import cols_interact
from greykite.common import constants as cst
from greykite.common.features.timeseries_features import build_time_features_df
from greykite.common.features.timeseries_features import convert_date_to_continuous_time
from greykite.framework.benchmark.data_loader_ts import DataLoaderTS
from greykite.framework.input.univariate_time_series import UnivariateTimeSeries
from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
import matplotlib.pyplot as plt
# Matplotlib defaults: ggplot theme, large fonts, wide figures.
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})
plt.rcParams['figure.figsize'] = [15, 8]
# Pull daily BTC-USD closing prices from Yahoo Finance.
start_date = '2016-05-29'
end_date = '2021-06-24'
raw_close = pdr.get_data_yahoo(
    ['BTC-USD'],
    start=start_date,
    end=end_date,
)['Close']
btc_data = raw_close.reset_index()
# Collapse any duplicated dates, keeping the maximum close per day.
btc_data = (
    btc_data.groupby('Date')['BTC-USD']
    .max()
    .reset_index()
)
The Greykite basic data loader
# read in the data to take a look
# Load the frame into Greykite's univariate time-series container
# so we can use its built-in description and plotting utilities.
ts = UnivariateTimeSeries()
ts.load_data(
    df=btc_data,
    time_col="Date",      # timestamp column
    value_col="BTC-USD",  # observed value column
    freq="D",             # daily frequency
)
# Summary statistics for the time axis and for the values.
print(ts.describe_time_col())
print(ts.describe_value_col())
{'data_points': 1854, 'mean_increment_secs': 86400.0, 'min_timestamp': Timestamp('2016-05-28 00:00:00'), 'max_timestamp': Timestamp('2021-06-24 00:00:00')}
count 1844.000000
mean 10372.891871
std 12758.699774
min 526.232971
25% 3551.181030
50% 7277.461182
75% 10277.683350
max 63503.457031
Name: y, dtype: float64
The Greykite basic data plotting -- Interactive!!
# Greykite's interactive plot of the raw series.
fig = ts.plot()
pio.show(fig)

# Silence library warnings for the remainder of the run.
import warnings
warnings.filterwarnings("ignore")
The Greykite basic model config & plotting
# --- Baseline SILVERKITE model with default components ---
start_time = time.time()

# Dataset description: which columns hold time/values and the sampling frequency.
metadata = dict(
    time_col="Date",
    value_col="BTC-USD",
    freq="D",
)
# Model configuration: default SILVERKITE template, 30-day horizon,
# 95% prediction intervals.
config = ForecastConfig.from_dict(
    dict(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=30,  # forecast 30 days
        coverage=0.95,        # 95% prediction intervals
        metadata_param=metadata,
    )
)
# Fit and forecast; the result is also kept on `forecaster.forecast_result`.
forecaster = Forecaster()
result = forecaster.run_forecast_config(
    df=btc_data,
    config=config,
)
time1 = round(time.time() - start_time)
print(f'Time taken: {time1}s')
Fitting 3 folds for each of 1 candidates, totalling 3 fits Time taken: 41s
# Interactive forecast plot plus the component breakdown (trend,
# seasonality, events).
forecast = result.forecast
pio.show(forecast.plot())
pio.show(forecast.plot_components())
Built-in backtest, with a long list of metrics!
"run_forecast_config" has automatically performed a train-test split. The test set has the same length as the forecast horizon, i.e. 30 days.
# Plot the backtest (model fit on the training span vs. held-out test span).
backtest = result.backtest
pio.show(backtest.plot())
# Collect the holdout metrics into a one-column frame for later comparison.
result1 = (
    pd.DataFrame(result.backtest.test_evaluation, index=["Value"])
    .transpose()
    .rename(columns={'Value': 'Result1'})
)
result1
| Result1 | |
|---|---|
| CORR | -0.146014 |
| R2 | -58.8182 |
| MSE | 2.72267e+08 |
| RMSE | 16500.5 |
| MAE | 16346 |
| MedAE | 15798.1 |
| MAPE | 44.7947 |
| MedAPE | 44.4366 |
| sMAPE | 28.9355 |
| Q80 | 13076.8 |
| Q95 | 15528.7 |
| Q99 | 16182.6 |
| OutsideTolerance1p | 1 |
| OutsideTolerance2p | 1 |
| OutsideTolerance3p | 1 |
| OutsideTolerance4p | 1 |
| OutsideTolerance5p | 1 |
| Outside Tolerance (fraction) | None |
| R2_null_model_score | None |
| Prediction Band Width (%) | 100.183 |
| Prediction Band Coverage (fraction) | 0.758621 |
| Coverage: Lower Band | 0 |
| Coverage: Upper Band | 0.758621 |
| Coverage Diff: Actual_Coverage - Intended_Coverage | -0.191379 |
Cross Validation
By default, run_forecast_config provides historical evaluation, so you can see how the forecast performs on past data. This is stored in grid_search (cross-validation splits)
def cv_metrisc(result, colname):
    """Summarize a forecast result's cross-validation metrics as one column.

    Parameters
    ----------
    result : forecast result exposing a ``grid_search`` attribute.
    colname : str
        Label for the resulting metrics column (e.g. ``'cv1'``).

    Returns
    -------
    pandas.DataFrame
        Metrics as rows, with a single column named ``colname``.
    """
    summary = summarize_grid_search_results(
        grid_search=result.grid_search,
        decimals=2,
        # Keep only the headline columns to save space in the printed output;
        # remove to show all available metrics.
        cv_report_metrics=None,
        column_order=[
            "rank", "mean_test", "split_test", "mean_train", "split_train",
            "mean_fit_time", "mean_score_time", "params",
        ],
    )
    # Move `params` into the index (dropping the column), relabel the single
    # row with `colname`, then transpose so metrics become rows.
    summary["params"] = summary["params"].astype(str)
    summary.set_index("params", drop=True, inplace=True)
    summary.index = [colname]
    return summary.transpose()
# CV metrics for the baseline model, as a single 'cv1' column.
cv_results1 = cv_metrisc(result,'cv1')
cv_results1
| cv1 | |
|---|---|
| rank_test_MAPE | 1 |
| mean_test_MAPE | 46.09 |
| split_test_MAPE | (4.2, 72.28, 61.79) |
| mean_train_MAPE | 54.96 |
| split_train_MAPE | (2.06, 76.24, 86.6) |
| mean_fit_time | 6.31 |
| mean_score_time | 0.7 |
# Overlay each year's centered values by (month, day-of-month) to
# visually inspect yearly seasonality.
overlay_kwargs = dict(
    groupby_time_feature="month_dom",
    show_mean=True,
    show_quantiles=False,
    show_overlays=True,
    overlay_label_time_feature="year",
    overlay_style={"line": {"width": 1}, "opacity": 0.5},
    center_values=True,
    xlabel="day of year",
    ylabel=ts.original_value_col,
    title="yearly seasonality for each year (centered)",
)
fig = ts.plot_quantiles_and_overlays(**overlay_kwargs)
pio.show(fig)
# Detect trend changepoints via adaptive lasso.
model = ChangepointDetector()
res = model.find_trend_changepoints(
    df=btc_data,
    time_col="Date",
    value_col="BTC-USD",
    yearly_seasonality_order=1,     # fit first-order yearly seasonality along with the trend
    regularization_strength=0.4,    # in [0, 1]; larger -> fewer changepoints (1.0 -> none)
    resample_freq="7D",             # weekly aggregation smooths small fluctuations
    potential_changepoint_n=25,     # number of candidate changepoints
    actual_changepoint_min_distance="30D",
    # no_changepoint_distance_from_end="30D",
)
fig = model.plot(
    observation=True,
    trend_estimate=False,
    trend_change=True,
    yearly_seasonality_estimate=True,
    adaptive_lasso_estimate=True,
    plot=False,  # return the figure object rather than displaying it
)
pio.show(fig)
# Detect seasonality changepoints on the same series.
res = model.find_seasonality_changepoints(
    df=btc_data,        # data df
    time_col="Date",    # time column name
    value_col="BTC-USD",
    regularization_strength=0.1)
# View detected changepoint dates, one column per seasonality component.
# Wrapping each list in a Series lets pandas pad shorter columns with NaN.
# (Dict comprehension replaces the original dict([...]) construction.)
pd.DataFrame({k: pd.Series(v) for k, v in res["seasonality_changepoints"].items()})
| weekly | yearly | |
|---|---|---|
| 0 | NaN | 2018-11-20 |
| 1 | NaN | 2020-11-19 |
# Plot the detected seasonality changepoints on top of the series.
fig = model.plot(
    seasonality_change=True,               # mark detected seasonality changepoints
    seasonality_change_by_component=True,  # split by component (daily, weekly, etc.)
    seasonality_estimate=True,             # draw the estimated trend + seasonality
    plot=False,                            # return the figure object instead of displaying it
)
pio.show(fig)
# Factor in Elon Musk's bitcoin-related tweets as custom daily events,
# each labeled with its (perceived) sentiment.
_tweet_events = [
    ('2020-07-15', 'neg_tweet'),
    ('2020-12-21', 'pos_tweet'),
    ('2021-01-29', 'pos_tweet'),
    ('2021-02-01', 'pos_tweet'),
    ('2021-02-08', 'pos_tweet'),
    ('2021-04-20', 'neg_tweet'),
    ('2021-05-12', 'neg_tweet'),
    ('2021-05-17', 'neg_tweet'),
    ('2021-05-19', 'neg_tweet'),
    ('2021-05-24', 'pos_tweet'),
    ('2021-06-03', 'neg_tweet'),
]
event_start_dates = [day for day, _ in _tweet_events]
daily_event_df_dict = {
    "tweet": pd.DataFrame({
        "date": event_start_dates,
        "event_name": [label for _, label in _tweet_events],
    })
}
Model with Model Components defined
# --- SILVERKITE model with explicit changepoints and tweet events ---
start_time = time.time()

# Dataset description passed to the template.
metadata = dict(
    time_col="Date",
    value_col="BTC-USD",
    freq="D",
)
# Changepoint settings mirror the detector runs above; also attach the
# custom tweet events.
model_components = dict(
    changepoints={
        "changepoints_dict": {
            "method": "auto",
            "yearly_seasonality_order": 1,
            "regularization_strength": 0.4,
            "resample_freq": "7D",
            "potential_changepoint_n": 25,
            "actual_changepoint_min_distance": "30D",
            "no_changepoint_distance_from_end": "30D",
        },
        "seasonality_changepoints_dict": {
            "regularization_strength": 0.1,
        },
    },
    events={'daily_event_df_dict': daily_event_df_dict},
)
# Build the model config (note: horizon is 30 days, not 1 year).
config = ForecastConfig.from_dict(
    dict(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=30,  # forecast 30 days
        coverage=0.95,        # 95% prediction intervals
        metadata_param=metadata,
        model_components_param=model_components,
    )
)
# Run with the changepoint parameters.
forecaster = Forecaster()
result = forecaster.run_forecast_config(
    df=btc_data,
    config=config,
)
time2 = round(time.time() - start_time)
print(f'Time taken: {time2}s')
Fitting 3 folds for each of 1 candidates, totalling 3 fits Time taken: 185s
# Backtest and forecast plots for the changepoints + events model.
backtest = result.backtest
pio.show(backtest.plot())
forecast = result.forecast
pio.show(forecast.plot())
# Holdout metrics as a one-column frame for later comparison.
result2 = (
    pd.DataFrame(result.backtest.test_evaluation, index=["Value"])
    .transpose()
    .rename(columns={'Value': 'Result2'})
)
result2
| Result2 | |
|---|---|
| CORR | 0.0993836 |
| R2 | -46.9868 |
| MSE | 2.18416e+08 |
| RMSE | 14778.9 |
| MAE | 14564.9 |
| MedAE | 14416.2 |
| MAPE | 39.9167 |
| MedAPE | 41.0095 |
| sMAPE | 25.0655 |
| Q80 | 11651.9 |
| Q95 | 13836.6 |
| Q99 | 14419.2 |
| OutsideTolerance1p | 1 |
| OutsideTolerance2p | 1 |
| OutsideTolerance3p | 1 |
| OutsideTolerance4p | 1 |
| OutsideTolerance5p | 1 |
| Outside Tolerance (fraction) | None |
| R2_null_model_score | None |
| Prediction Band Width (%) | 77.704 |
| Prediction Band Coverage (fraction) | 0.37931 |
| Coverage: Lower Band | 0 |
| Coverage: Upper Band | 0.37931 |
| Coverage Diff: Actual_Coverage - Intended_Coverage | -0.57069 |
# CV metrics for the changepoints + events model, labeled 'cv2'.
cv_results2 = cv_metrisc(result,'cv2')
cv_results2
| cv2 | |
|---|---|
| rank_test_MAPE | 1 |
| mean_test_MAPE | 39.97 |
| split_test_MAPE | (3.04, 70.21, 46.66) |
| mean_train_MAPE | 31.31 |
| split_train_MAPE | (2.11, 68.33, 23.48) |
| mean_fit_time | 40.29 |
| mean_score_time | 1.07 |
# --- SILVERKITE model with a gradient-boosting fit algorithm ---
start_time = time.time()

# Dataset description passed to the template.
metadata = dict(
    time_col="Date",
    value_col="BTC-USD",
    freq="D",
)
# Swap the default fitting algorithm for gradient boosting.
model_components = dict(
    custom=dict(
        fit_algorithm_dict={
            "fit_algorithm": "gradient_boosting",
        },
    ),
)
# Build the model config (note: horizon is 30 days, not 1 year).
config = ForecastConfig.from_dict(
    dict(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=30,  # forecast 30 days
        coverage=0.95,        # 95% prediction intervals
        metadata_param=metadata,
        model_components_param=model_components,
    )
)
# Run with the custom fit algorithm.
forecaster = Forecaster()
result = forecaster.run_forecast_config(
    df=btc_data,
    config=config,
)
time3 = round(time.time() - start_time)
print(f'Time taken: {time3}s')
Fitting 3 folds for each of 1 candidates, totalling 3 fits Time taken: 23s
# Backtest and forecast plots for the gradient-boosting model.
backtest = result.backtest
pio.show(backtest.plot())
forecast = result.forecast
pio.show(forecast.plot())
# Holdout metrics as a one-column frame for later comparison.
result3 = (
    pd.DataFrame(result.backtest.test_evaluation, index=["Value"])
    .transpose()
    .rename(columns={'Value': 'Result3'})
)
result3
| Result3 | |
|---|---|
| CORR | 0.0783112 |
| R2 | -4.15807 |
| MSE | 2.34774e+07 |
| RMSE | 4845.34 |
| MAE | 4292.18 |
| MedAE | 3846.23 |
| MAPE | 12.1985 |
| MedAPE | 10.425 |
| sMAPE | 5.64718 |
| Q80 | 858.437 |
| Q95 | 214.609 |
| Q99 | 42.9218 |
| OutsideTolerance1p | 1 |
| OutsideTolerance2p | 0.965517 |
| OutsideTolerance3p | 0.931034 |
| OutsideTolerance4p | 0.827586 |
| OutsideTolerance5p | 0.827586 |
| Outside Tolerance (fraction) | None |
| R2_null_model_score | None |
| Prediction Band Width (%) | 5.43911 |
| Prediction Band Coverage (fraction) | 0.0689655 |
| Coverage: Lower Band | 0.0689655 |
| Coverage: Upper Band | 0 |
| Coverage Diff: Actual_Coverage - Intended_Coverage | -0.881034 |
# CV metrics for the gradient-boosting model, labeled 'cv3'.
cv_results3 = cv_metrisc(result,'cv3')
cv_results3
| cv3 | |
|---|---|
| rank_test_MAPE | 1 |
| mean_test_MAPE | 9.04 |
| split_test_MAPE | (2.36, 8.16, 16.61) |
| mean_train_MAPE | 3.65 |
| split_train_MAPE | (0.26, 5.23, 5.45) |
| mean_fit_time | 2.9 |
| mean_score_time | 0.68 |
# Side-by-side CV metrics for the three model variants.
cv_results =cv_results1.join(cv_results2).join(cv_results3)
cv_results
| cv1 | cv2 | cv3 | |
|---|---|---|---|
| rank_test_MAPE | 1 | 1 | 1 |
| mean_test_MAPE | 46.09 | 39.97 | 9.04 |
| split_test_MAPE | (4.2, 72.28, 61.79) | (3.04, 70.21, 46.66) | (2.36, 8.16, 16.61) |
| mean_train_MAPE | 54.96 | 31.31 | 3.65 |
| split_train_MAPE | (2.06, 76.24, 86.6) | (2.11, 68.33, 23.48) | (0.26, 5.23, 5.45) |
| mean_fit_time | 6.31 | 40.29 | 2.9 |
| mean_score_time | 0.7 | 1.07 | 0.68 |
# Row of wall-clock training times for each configuration.
time_df = pd.DataFrame({'Result1': time1, 'Result2': time2, 'Result3': time3,}, index=['time'])
# Stack the timing row on top of the three backtest metric columns.
results = pd.concat([time_df, result1.join(result2).join(result3)])
results
| Result1 | Result2 | Result3 | |
|---|---|---|---|
| time | 41 | 185 | 23 |
| CORR | -0.146014 | 0.0993836 | 0.0783112 |
| R2 | -58.8182 | -46.9868 | -4.15807 |
| MSE | 2.72267e+08 | 2.18416e+08 | 2.34774e+07 |
| RMSE | 16500.5 | 14778.9 | 4845.34 |
| MAE | 16346 | 14564.9 | 4292.18 |
| MedAE | 15798.1 | 14416.2 | 3846.23 |
| MAPE | 44.7947 | 39.9167 | 12.1985 |
| MedAPE | 44.4366 | 41.0095 | 10.425 |
| sMAPE | 28.9355 | 25.0655 | 5.64718 |
| Q80 | 13076.8 | 11651.9 | 858.437 |
| Q95 | 15528.7 | 13836.6 | 214.609 |
| Q99 | 16182.6 | 14419.2 | 42.9218 |
| OutsideTolerance1p | 1 | 1 | 1 |
| OutsideTolerance2p | 1 | 1 | 0.965517 |
| OutsideTolerance3p | 1 | 1 | 0.931034 |
| OutsideTolerance4p | 1 | 1 | 0.827586 |
| OutsideTolerance5p | 1 | 1 | 0.827586 |
| Outside Tolerance (fraction) | None | None | None |
| R2_null_model_score | None | None | None |
| Prediction Band Width (%) | 100.183 | 77.704 | 5.43911 |
| Prediction Band Coverage (fraction) | 0.758621 | 0.37931 | 0.0689655 |
| Coverage: Lower Band | 0 | 0 | 0.0689655 |
| Coverage: Upper Band | 0.758621 | 0.37931 | 0 |
| Coverage Diff: Actual_Coverage - Intended_Coverage | -0.191379 | -0.57069 | -0.881034 |